Social Network
In this take-home exercise, we reveal the patterns of life in Ohio, USA by creating data visualization with tmap.
With reference to point 2 in Challenge 1 of VAST Challenge 2022, the following questions will be addressed:
Consider the social activities in the community.
Before we start to draw graphs, there are some work to do:
To draw social network plots, I use igraph.
devtools::install_github("itsleeds/od", build_vignettes = TRUE)
packages = c('ggiraph', 'plotly', 'tidyverse', 'DT','gganimate',
'knitr', 'ggdist', 'scales', 'grid', 'gridExtra',
'patchwork','ggsignif','gghighlight',"hrbrthemes",
'readxl', 'gifski', 'gapminder','treemap', 'treemapify',
'rPackedBar','ggridges','rmarkdown','crosstalk',
'd3scatter','tidycensus','timetk','ggseas','lubridate',
'ggrepel','doSNOW','data.table','ViSiElse','sf','tmap',
'clock','dplyr','od','igraph', 'tidygraph', 'ggstatsplot',
'ggraph', 'visNetwork', 'lubridate', 'clock',
'tidyverse', 'graphlayouts','FunnelPlotR', 'plotly', 'knitr')
for(p in packages) {
if(!require(p, character.only = T)) {
install.packages(p)
}
library(p, character.only = T)
}
The data sets used in this take home exercise is from the social network journals of participants in Ohio City.
There are two data sets. One contains the nodes data and the other contains the edges (also know as link) data.
participants <- read_csv("./raw_data/Attributes/Participants.csv")
social_network <- read_csv("./raw_data/Journals/SocialNetwork.csv")
financial <- read_csv("./raw_data/Journals/FinancialJournal.csv")
For participants:
wage for each participants by joining with
Financial journey.For social graph:
For friends num:
participants$educationLevel<-factor(participants$educationLevel,ordered=TRUE,levels=c('Low','HighSchoolOrCollege',"Bachelors","Graduate"))
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30','31-40', '41-50', '51-60', '>60')
participants$Age_Group <- cut(participants$age, breaks=brks, labels = grps, right = FALSE)
brks <- c(0, 0.3, 0.5, 0.6, 1)
grps <- c('Really Sad', 'Sad','Neutral', 'Happy')
participants$Joviality_Group <- cut(participants$joviality, breaks=brks, labels = grps, right = FALSE)
income_par <- financial %>%
filter(category %in% c('Wage')) %>%
group_by(participantId,month=lubridate::month(timestamp)) %>%
summarise(wage = round(sum(amount),1)) %>%
ungroup()%>%
group_by(participantId) %>%
summarise(wage = mean(wage)) %>%
ungroup()
participants <- participants %>%
inner_join(income_par, by = "participantId")
socialNetwork_edges <- social_network %>%
group_by(from=participantIdFrom, to=participantIdTo) %>%
filter(from!=to) %>%
summarise(weight = n()) %>%
filter(weight > 1) %>%
ungroup()
parId_in_socialNetwork <- union(unique(socialNetwork_edges$from),unique(socialNetwork_edges$to)) %>%
sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
socialNetwork_nodes <- participants %>%
inner_join(parId_in_socialNetwork, by = "participantId")
socialNetwork_nodes$id <- socialNetwork_nodes$participantId
socialNetwork_graph <- igraph::graph_from_data_frame(socialNetwork_edges,
vertices = socialNetwork_nodes)%>%
as_tbl_graph()
friends_num_df <- socialNetwork_edges %>%
group_by(from) %>%
filter(from!=to) %>%
group_by(participantId = from) %>%
summarise(friends_num = n()) %>%
ungroup() %>%
inner_join(participants, by = "participantId")
interaction_num_df <- socialNetwork_edges %>%
group_by(participantId = from) %>%
filter(participantId!=to) %>%
summarise(interaction_num = sum(weight)) %>%
ungroup() %>%
inner_join(participants, by = "participantId")
top5_most_active<-interaction_num_df %>%
arrange(desc(interaction_num)) %>%
slice(1:5)
top5_most_active$id <- top5_most_active$participantId
top5_most_active_nodes <- top5_most_active
top5_most_active_edges <- social_network %>%
group_by(from=participantIdFrom, to=participantIdTo) %>%
filter((from!=to)&
(from %in% top5_most_active$id)) %>%
summarise(weight = n()) %>%
filter(weight > 1) %>%
ungroup()
parId_in_socialNetwork <- union(unique(top5_most_active_edges$from),unique(top5_most_active_edges$to)) %>%
sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
top5_most_active_nodes <- participants %>%
inner_join(parId_in_socialNetwork, by = "participantId")
top5_most_active_nodes$id <- top5_most_active_nodes$participantId
top5_most_active_graph <- igraph::graph_from_data_frame(top5_most_active_edges,
vertices = top5_most_active_nodes)%>%
as_tbl_graph()
write_rds(top5_most_active_graph, './data/top5_most_active_graph.rds')
write_rds(top5_most_active_nodes, './data/top5_most_active_nodes.rds')
write_rds(top5_most_active_edges, './data/top5_most_active_edges.rds')
write_rds(interaction_num_df,'./data/interaction_num.rds')
write_rds(socialNetwork_graph,'./data/socialNetwork_graph.rds')
write_rds(socialNetwork_nodes,'./data/socialNetwork_nodes.rds')
write_rds(socialNetwork_edges,'./data/socialNetwork_edges.rds')
write_rds(friends_num_df,'./data/friends_num.rds')
top5_most_active_graph <- read_rds('./data/top5_most_active_graph.rds')
top5_most_active_nodes <- read_rds('./data/top5_most_active_nodes.rds')
top5_most_active_edges <- read_rds('./data/top5_most_active_edges.rds')
interaction_num_df <- read_rds('./data/interaction_num.rds')
social_network <- read_csv("./raw_data/Journals/SocialNetwork.csv")
socialNetwork_graph <- read_rds('./data/socialNetwork_graph.rds')
socialNetwork_nodes <- read_rds('./data/socialNetwork_nodes.rds')
socialNetwork_edges <- read_rds('./data/socialNetwork_edges.rds')
friends_num_df <- read_rds('./data/friends_num.rds')
Suppose the first thing we want to inspect is the distribution of the number of social interactions for participants of different education levels. We also want to know if the mean differences in the number of social interaction between different education levels is statistically significant.
I apply ANOVA test to see if there’s any relationship between social interaction tims and people’s education level. We can see that there’s a huge difference between the median social interaction times within different groups. As we can see the median social activeness is positively correlated with degree level. People with higher degree is more active.
I plotted a scatter plot of the social zone size and people’s wage. Surprisingly, the rich people do not have a big social zone.
p2 <- ggplot(data=friends_num_df, aes(x = wage,
y = friends_num,
text =paste("Wage:", round(wage,2),
"\nNo. of Friends:",friends_num)))+
geom_point(aes(size=friends_num,color=friends_num), alpha = 1/10) +
labs(y= 'No. of Interacted People', x= 'Wage',
title = "Fig2: Relatiobship between wage and social zone size",
subtitle = "People who has high wage tend to keep a smalll social zone")
ggplotly(p2,tooltip = c("text"))
As we can see, people who are happy has a strong connection with each other.
ggraph(top5_most_active_graph,
layout = "fr") + # random
geom_edge_link(aes(width=weight,alpha=0.2)) +
geom_node_point(aes(color=Joviality_Group,
size = 0.3)) +
theme_void() +# remove gray background +
facet_nodes(~Joviality_Group)
top5_edges_aggregated <- top5_most_active_edges %>%
left_join(top5_most_active_nodes, by = c("from" = "id")) %>%
rename(from_JP = Joviality_Group) %>%
left_join(top5_most_active_nodes, by = c("to" = "id")) %>%
rename(to_JP = Joviality_Group) %>%
group_by(from, to) %>%
summarise(weight = n()) %>%
filter(from!=to) %>%
filter(weight > 1) %>%
ungroup()
top5_most_active_nodes <- top5_most_active_nodes %>%
rename(group = Joviality_Group)
visNetwork(top5_most_active_nodes,
top5_edges_aggregated) %>%
visIgraphLayout(layout = "layout_with_fr") %>%
visLegend() %>%
visLayout(randomSeed = 123)